This report was generated on 2021-08-02 10:19:21. R version: 4.1.0 on x86_64-pc-linux-gnu. For this report, CRAN packages as of 2021-06-01 were used.
…
The preprocessing and analysis of the data were conducted in the R project for statistical computing. The RMarkdown script used to generate this document and all the resulting data can be downloaded under this link. By executing main.Rmd, the process described herein can be reproduced and this document can be generated. In the course of this, data from the folder input will be processed and results will be written to output. The online HTML version of the analysis can be accessed through this link.
The code for the herein described process can also be freely downloaded from https://github.com/fernandomillanvillalobos/datavizR.
…
abc.csv (Example)

| Attribute | Type | Description |
|---|---|---|
| a | Numeric | … |
| b | Numeric | … |
| c | Numeric | … |
xyz.csv…
## [1] "package package:rstudioapi detached"
## [1] "package package:knitr detached"
# from https://mran.revolutionanalytics.com/web/packages/\
# checkpoint/vignettes/using-checkpoint-with-knitr.html
# Write the list of required packages to a temporary script, manifest.R.
# The file is sourced (attaching the packages) and deleted again further down.
# if you don't need a package, remove it from here (commenting not sufficient)
# NOTE(review): presumably because checkpoint scans the file text for
# library() calls — confirm against the checkpoint documentation.
# tidyverse: see https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/
cat("
library(rstudioapi)
library(tidyverse, warn.conflicts = FALSE) # ggplot2, dplyr, tidyr, readr, purrr, tibble, magrittr, readxl
library(scales) # scales for ggplot2
library(jsonlite) # json
library(lintr) # code linting
library(sf) # spatial data handling
library(rmarkdown)
library(data.table)
library(cowplot) # theme
library(extrafont)
library(waldo) # compare
library(psych) # some useful funs
library(ggrepel) # text labels
library(skimr) # data quality
library(gapminder) #data sets
library(socviz) # book Data Visualization: A Practical...
library(RColorBrewer)
library(dichromat) # palettes for color-blind
library(ggridges) # density ridges plots
library(viridis) # colors
library(janitor)", # names
file = "manifest.R")
# if checkpoint is not yet installed, install it (for people using this
# system for the first time)
# Bootstrap checkpoint: install it from GitHub when it is missing (installing
# devtools first if that is missing too), then attach it.
# requireNamespace() only tests availability without attaching anything; the
# final library() call stops with an error if the installation did not
# succeed, whereas require() would just return FALSE and let the script go on.
if (!requireNamespace("checkpoint", quietly = TRUE)) {
  if (!requireNamespace("devtools", quietly = TRUE)) {
    install.packages("devtools", repos = "http://cran.us.r-project.org")
  }
  devtools::install_github(
    "RevolutionAnalytics/checkpoint",
    ref = "v0.3.2", # could be adapted later,
    # as of now (beginning of July 2017
    # this is the current release on CRAN)
    repos = "http://cran.us.r-project.org"
  )
}
library(checkpoint)
# Make sure the checkpoint directory in the user's home exists.
# nolint start
if (!dir.exists("~/.checkpoint")) {
  dir.create("~/.checkpoint")
}
# nolint end
# install packages for the specified CRAN snapshot date
# package_date, path_to_wd and r_version are not defined in this section;
# presumably they are set earlier in the document.
# NOTE(review): the recorded session lists checkpoint 1.0.0 as attached, while
# the argument names below look like the 0.x interface — confirm they are
# still honored by the installed version.
checkpoint(
  snapshot_date = package_date,
  project = path_to_wd,
  verbose = TRUE, # spelled out: T and F are ordinary, reassignable variables
  scanForPackages = TRUE,
  use.knitr = FALSE,
  R.version = r_version
)
# The snapshot date is not needed any more once checkpoint has run.
rm(package_date)
# Attach every package listed in the manifest, then delete the temporary file.
source("manifest.R")
unlink("manifest.R")
# Record the R and package versions used to build this document.
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] janitor_2.1.0 viridis_0.6.1 viridisLite_0.4.0 ggridges_0.5.3
## [5] dichromat_2.0-0 RColorBrewer_1.1-2 socviz_1.2 gapminder_0.3.0
## [9] skimr_2.1.3 ggrepel_0.9.1 psych_2.1.3 waldo_0.2.5
## [13] extrafont_0.17 cowplot_1.1.1 data.table_1.14.0 rmarkdown_2.8
## [17] sf_0.9-8 lintr_2.0.1 jsonlite_1.7.2 scales_1.1.1
## [21] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.6 purrr_0.3.4
## [25] readr_1.4.0 tidyr_1.1.3 tibble_3.1.2 ggplot2_3.3.3
## [29] tidyverse_1.3.1 rstudioapi_0.13 checkpoint_1.0.0
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-152 fs_1.5.0 lubridate_1.7.10 httr_1.4.2
## [5] rprojroot_2.0.2 repr_1.1.3 tools_4.1.0 backports_1.2.1
## [9] bslib_0.2.5.1 utf8_1.2.1 R6_2.5.0 KernSmooth_2.23-20
## [13] DBI_1.1.1 lazyeval_0.2.2 colorspace_2.0-1 withr_2.4.2
## [17] gridExtra_2.3 tidyselect_1.1.1 mnormt_2.0.2 processx_3.5.2
## [21] compiler_4.1.0 extrafontdb_1.0 cli_2.5.0 rvest_1.0.0
## [25] xml2_1.3.2 desc_1.3.0 sass_0.4.0 classInt_0.4-3
## [29] callr_3.7.0 proxy_0.4-25 digest_0.6.27 base64enc_0.1-3
## [33] pkgconfig_2.0.3 htmltools_0.5.1.1 dbplyr_2.1.1 rlang_0.4.11
## [37] readxl_1.3.1 jquerylib_0.1.4 generics_0.1.0 magrittr_2.0.1
## [41] Rcpp_1.0.6 munsell_0.5.0 fansi_0.5.0 lifecycle_1.0.0
## [45] stringi_1.6.2 yaml_2.2.1 snakecase_0.11.0 plyr_1.8.6
## [49] grid_4.1.0 parallel_4.1.0 crayon_1.4.1 lattice_0.20-44
## [53] haven_2.4.1 hms_1.1.0 tmvnsim_1.0-2 knitr_1.33
## [57] ps_1.6.0 pillar_1.6.1 reprex_2.0.0 glue_1.4.2
## [61] evaluate_0.14 rex_1.2.0 remotes_2.3.0 modelr_0.1.8
## [65] vctrs_0.3.8 Rttf2pt1_1.3.8 cellranger_1.1.0 gtable_0.3.0
## [69] assertthat_0.2.1 xfun_0.23 broom_0.7.6 e1071_1.7-7
## [73] cyclocomp_1.1.0 class_7.3-19 units_0.7-1 ellipsis_0.3.2
# if you want to outsource logic to other script files, see README for
# further information
# Load all visualizations functions as separate scripts
# Every helper script is first passed to knitr::read_chunk() and then sourced,
# one file after the other in the order listed here.
invisible(lapply(
  c(
    "scripts/dviz.supp.R",
    "scripts/themes.R",
    "scripts/plot_grid.R",
    "scripts/align_legend.R",
    "scripts/label_log10.R",
    "scripts/outliers.R"
  ),
  function(script_path) {
    knitr::read_chunk(script_path)
    source(script_path)
  }
))
The group aesthetic is usually only needed when the grouping information you need to tell ggplot about is not built into the variables being mapped.
# GDP per capita over time from gapminder; the base plot is kept in `p`.
p <- ggplot(gapminder, aes(x = year, y = gdpPercap))
# Mapping group to country gives every country its own line.
p + geom_line(mapping = aes(group = country))
The facet_wrap() function can take a series of arguments, but the most important is the first one, which is specified using R’s “formula” syntax, which uses the tilde character, ~. Facets are usually a one-sided formula. Most of the time you will just want a single variable on the right side of the formula.
# Country trajectories, one panel per continent.
p <- ggplot(data = gapminder,
            mapping = aes(x = year,
                          y = gdpPercap))
p + geom_line(aes(group = country)) + facet_wrap(~ continent)

# Polished version: gray country lines, a loess smoother per continent, a log
# y-axis labelled in dollars, and all five panels in a single row.
p <- ggplot(data = gapminder, mapping = aes(x = year, y = gdpPercap))
p + geom_line(color = "gray70", aes(group = country)) +
  geom_smooth(size = 1.1, method = "loess", se = FALSE) +
  scale_y_log10(labels = scales::dollar) +
  facet_wrap(~ continent, ncol = 5) +
  labs(x = "Year",
       y = "GDP per capita",
       title = "GDP per capita on Five Continents")
The facet_wrap() function is best used when you want a series of small multiples based on a single categorical variable. Your panels will be laid out in order and then wrapped into a grid. If you wish you can specify the number of rows or the number of columns in the resulting layout. Facets can be more complex than this. For instance, you might want to cross-classify some data by two categorical variables. In that case you should try facet_grid() instead. This function will lay out your plot in a true two-dimensional arrangement, instead of a series of panels wrapped into a grid.
# Number of children against age, cross-classified into a grid of panels:
# sex defines the rows, race the columns.
p <- ggplot(gss_sm, aes(x = age, y = childs))
p +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_grid(sex ~ race)
Multipanel layouts of this kind are especially effective when used to summarize continuous variation (as in a scatterplot) across two or more categorical variables, with the categories (and hence the panels) ordered in some sensible way.
Some geoms plot our data directly on the figure, as is the case with geom_point(), which takes variables designated as x and y and plots the points on a grid. But other geoms clearly do more work on the data before it gets plotted. Every geom_ function has an associated stat_ function that it uses by default. The reverse is also the case: every stat_ function has an associated geom_ function that it will plot by default if you ask it to. Sometimes the calculations being done by the stat_ functions that work together with the geom_ functions might not be immediately obvious. When ggplot calculates the count or the proportion, it returns temporary variables that we can use as mappings in our plots.
p <- ggplot(data = gss_sm, mapping = aes(x = bigregion))
# geom_bar called the default stat_ function associated with it, stat_count().
p + geom_bar()

# We no longer have a count on the y-axis, but the proportions of the bars all
# have a value of 1, so all the bars are the same height. We want them to sum
# to 1, so that we get the number of observations per continent as a
# proportion of the total number of observations. This is a grouping issue
# again. In a sense, it’s the reverse of the earlier grouping problem we faced
# when we needed to tell ggplot that our yearly data was grouped by country.
# after_stat(prop) refers to the prop variable computed by the stat; it
# replaces the older ..prop.. notation.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion))
p + geom_bar(mapping = aes(y = after_stat(prop)))

# In this case, we need to tell ggplot to ignore the x-categories when
# calculating denominator of the proportion, and use the total number
# observations instead. To do so we specify group = 1 inside the aes() call.
# The value of 1 is just a kind of “dummy group” that tells ggplot to use the
# whole dataset when establishing the denominator for its prop calculations.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion))
p + geom_bar(mapping = aes(y = after_stat(prop), group = 1)) # 1 is a dummy group

# Another example
p <- ggplot(data = gss_sm,
            mapping = aes(x = religion, fill = religion))
# If we set guides(fill = "none"), the legend is removed
p + geom_bar() + guides(fill = "none")
A more appropriate use of the fill aesthetic with geom_bar() is to cross-classify two categorical variables. This is the graphical equivalent of a frequency table of counts or proportions. When we cross-classify categories in bar charts, there are several ways to display the results. With geom_bar() the output is controlled by the position argument.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion, fill = religion))
# The default output of geom_bar() is a stacked bar chart
p + geom_bar()

# An alternative choice is to set the position argument to "fill".
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion, fill = religion))
# the bars are all the same height
p + geom_bar(position = "fill")

# When we just wanted the overall proportions for one variable, we mapped
# group = 1 to tell ggplot to calculate the proportions with respect to the
# overall N.
# after_stat(prop) replaces the older ..prop.. notation.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion, fill = religion))
p + geom_bar(position = "dodge",
             mapping = aes(y = after_stat(prop), group = religion))

# We can ask ggplot to give us a proportional bar chart of religious
# affiliation, and then facet that by region
p <- ggplot(data = gss_sm,
            mapping = aes(x = religion))
p + geom_bar(position = "dodge",
             mapping = aes(y = after_stat(prop), group = bigregion)) +
  facet_wrap(~ bigregion, ncol = 1)
A histogram is a way of summarizing a continuous variable by chopping it up into segments or “bins” and counting how many observations are found within each bin. In a bar chart, the categories are given to us going in (e.g., regions of the country, or religious affiliation). With a histogram, we have to decide how finely to bin the data. As with the bar charts, a newly-calculated variable, count, appears on the y-axis.
While histograms summarize single variables, it’s also possible to use several at once to compare distributions. We can facet histograms by some variable of interest.
# By default, the geom_histogram() function will choose a bin size for us based on a rule of thumb.
p <- ggplot(data = midwest,
            mapping = aes(x = area))
p + geom_histogram()

# selecting another bin size
p <- ggplot(data = midwest,
            mapping = aes(x = area))
p + geom_histogram(bins = 10)

oh_wi <- c("OH", "WI")
# subset the data
# %in% operator is a convenient way to filter on more than one term in a
# variable
p <- ggplot(data = subset(midwest, subset = state %in% oh_wi),
            mapping = aes(x = percollege, fill = state))
p + geom_histogram(alpha = 0.4, bins = 20)

# When working with a continuous variable, an alternative to binning the data
# and making a histogram is to calculate a kernel density estimate of the
# underlying distribution.
p <- ggplot(data = midwest,
            mapping = aes(x = area, fill = state, color = state))
p + geom_density(alpha = 0.3)

# For geom_density(), the stat_density() function can return its default
# density statistic, or scaled, which will give a proportional density
# estimate. It can also return a statistic called count, which is the density
# times the number of points. This can be used in stacked density plots.
# Computed statistics are referenced with after_stat(), which replaces the
# older ..name.. notation.
p <- ggplot(data = subset(midwest, subset = state %in% oh_wi),
            mapping = aes(x = area, fill = state, color = state))
p + geom_density(alpha = 0.3, mapping = aes(y = after_stat(scaled)))
Often our data is, in effect, already a summary table. This can happen when we have computed a table of marginal frequencies or percentages from the original data. Because we are working directly with percentage values in a summary table, we no longer have any need for ggplot to count up values for us or perform any other calculations. That is, we do not need the services of any stat_ functions. We can tell geom_bar() not to do any work on the variable before plotting it. To do this we say stat = ‘identity’ in the geom_bar() call.
# The titanic table already holds percentages, so the bars are drawn from the
# values as given (stat = "identity") instead of being counted.
p <- ggplot(data = titanic,
            mapping = aes(x = fate, y = percent, fill = sex))
p + geom_bar(position = "dodge", stat = "identity") + theme(legend.position = "top")

# For convenience ggplot also provides a related geom, geom_col(), which has exactly the same effect but assumes that stat = "identity".
# The position argument in geom_bar() and geom_col() can also take the value of "identity". Just as stat = "identity" means “don’t do any summary calculations”, position = "identity" means “just plot the values as given”.
p <- ggplot(data = oecd_sum,
            mapping = aes(x = year, y = diff, fill = hi_lo))
# guides(fill = "none") hides the legend; passing FALSE here is deprecated in
# later ggplot2 releases.
p + geom_col() + guides(fill = "none") +
  labs(x = NULL, y = "Difference in Years",
       title = "The US Life Expectancy Gap",
       subtitle = "Difference between US and OECD
average life expectancies, 1960-2015",
       caption = "Data: OECD. After a chart by Christopher Ingraham,
Washington Post, December 27th 2017.")
Letting the geoms (and their stat_ functions) do the work can sometimes get a little confusing. It is too easy to lose track of whether one has calculated row margins, column margins, or overall relative frequencies. A better strategy is to calculate the frequency table you want first and then plot that table. This has the benefit of allowing you to do some quick sanity checks on your tables, to make sure you haven’t made any errors.
In addition to making our code easier to read, it lets us more easily perform sanity checks on our results, so that we are sure we have grouped and summarized things in the right order.
# Religion counts within each region, turned into within-region shares.
rel_by_region <- gss_sm %>%
  group_by(bigregion, religion) %>% # from outermost to innermost
  # .groups = "drop_last" makes the default explicit: the result stays grouped
  # by bigregion, so sum(N) below is the regional total.
  summarize(N = n(), .groups = "drop_last") %>%
  mutate(freq = N / sum(N), # calculate relative proportion
         pct = round((freq * 100), 0)) # calculate percentage
# Checking pct
rel_by_region %>%
  group_by(bigregion) %>%
  summarize(total = sum(pct))
| bigregion | total |
|---|---|
| Northeast | 100 |
| Midwest | 101 |
| South | 100 |
| West | 101 |
# As a rule, dodged charts can be more cleanly expressed as faceted plots. Faceting removes the need for a legend and thus makes the chart simpler to read.
p <- ggplot(rel_by_region, aes(x = religion, y = pct, fill = religion))
p + geom_col(position = "dodge2") +
  labs(x = NULL, y = "Percent", fill = "Religion") +
  # "none" hides the legend; passing FALSE here is deprecated in later ggplot2
  # releases
  guides(fill = "none") +
  coord_flip() + # flip the axis
  facet_grid(~ bigregion)
The variables specified in group_by() are retained in the new data frame, the variables created with summarize() are added, and all the other variables in the original data are dropped.
We generally want our plots to present data in some meaningful order. The reorder() function will do this for us. It takes two required arguments. The first is the categorical variable or factor that we want to reorder. In this case, that’s country. The second is the variable we want to reorder it by. Here that is the donation rate, donors. The third and optional argument to reorder() is the function you want to use as a summary statistic. If you give reorder() only the first two required arguments, then by default it will reorder the categories of your first variable by the mean value of the second. You can use any sensible function you like to reorder the categorical variable (e.g., median, or sd).
# pick a sample: ten random rows of the first six columns
organdata %>%
  select(1:6) %>%
  sample_n(size = 10)
| country | year | donors | pop | pop_dens | gdp |
|---|---|---|---|---|---|
| Germany | 1996-01-01 | 12.70 | 81915 | 22.9434501 | 22164 |
| Ireland | 1994-01-01 | 20.30 | 3590 | 5.1088658 | 15990 |
| France | NA | NA | NA | NA | NA |
| Ireland | 2002-01-01 | 21.00 | 3932 | 5.5955600 | 32571 |
| Ireland | 1996-01-01 | 16.80 | 3636 | 5.1743276 | 19245 |
| Germany | 1993-01-01 | 13.90 | 81156 | 22.7308630 | 19983 |
| Netherlands | 1998-01-01 | 13.00 | 15707 | 37.8208524 | 24780 |
| France | 1996-01-01 | 15.10 | 58026 | 10.5214869 | 21990 |
| Canada | 2002-01-01 | 13.00 | 31414 | 0.3150660 | 30429 |
| Australia | 1999-01-01 | 8.67 | 18926 | 0.2444834 | 25445 |
# dotplot
p <- ggplot(organdata, aes(x = year, y = donors))
p + geom_point()

# lineplot
p <- ggplot(organdata, aes(x = year, y = donors))
p +
  geom_line(aes(group = country)) +
  facet_wrap(~ country)

# boxplot
p <- ggplot(organdata, aes(x = country, y = donors))
p +
  geom_boxplot() +
  coord_flip()

# boxplot reordered
p <- ggplot(
  organdata,
  aes(x = reorder(country, donors, na.rm = TRUE), y = donors)
)
p +
  geom_boxplot() +
  labs(x = NULL) +
  coord_flip()

# violin plot reordered and filled
p <- ggplot(
  organdata,
  aes(x = reorder(country, donors, na.rm = TRUE), y = donors, fill = world)
)
p +
  geom_violin() +
  labs(x = NULL) +
  coord_flip() +
  theme(legend.position = "top")

# dotplot reordered and colored
p <- ggplot(
  organdata,
  aes(x = reorder(country, donors, na.rm = TRUE), y = donors, color = world)
)
p +
  geom_point() +
  labs(x = NULL) +
  coord_flip() +
  theme(legend.position = "top")

# dotplot jittered, reordered and colored
p <- ggplot(
  organdata,
  aes(x = reorder(country, donors, na.rm = TRUE), y = donors, color = world)
)
p +
  # to avoid overplotting
  geom_jitter(position = position_jitter(width = 0.15)) +
  labs(x = NULL) +
  coord_flip() +
  theme(legend.position = "top")
When we want to summarize a categorical variable that just has one point per category, we should use this approach as well. The result will be a Cleveland dotplot, a simple and extremely effective method of presenting data that is usually better than either a bar chart or a table. Cleveland dotplots are generally preferred to bar or column charts. When making them, put the categories on the y-axis and order them in the way that is most relevant to the numerical summary you are providing. This sort of plot is also an excellent way to summarize model results or any data with error ranges.
# Country-level summaries within each consent-law regime, written out one
# statistic at a time. This first result is overwritten just below.
by_country <- organdata %>%
  group_by(consent_law, country) %>%
  summarize(donors_mean = mean(donors, na.rm = TRUE),
            donors_sd = sd(donors, na.rm = TRUE),
            gdp_mean = mean(gdp, na.rm = TRUE),
            health_mean = mean(health, na.rm = TRUE),
            roads_mean = mean(roads, na.rm = TRUE),
            cerebvas_mean = mean(cerebvas, na.rm = TRUE))
# Doing the same in another better way
by_country <- organdata %>%
  group_by(consent_law, country) %>%
  summarize_if(is.numeric, list(mean, sd), na.rm = TRUE) %>% # list instead funs
  ungroup()
# vars are named using the original variable plus a suffix. Because the list of
# functions is unnamed, the suffix is positional: _fn1 for mean and _fn2 for sd
# (see the column names printed below).
by_country
| consent_law | country | donors_fn1 | pop_fn1 | pop_dens_fn1 | gdp_fn1 | gdp_lag_fn1 | health_fn1 | health_lag_fn1 | pubhealth_fn1 | roads_fn1 | cerebvas_fn1 | assault_fn1 | external_fn1 | txp_pop_fn1 | donors_fn2 | pop_fn2 | pop_dens_fn2 | gdp_fn2 | gdp_lag_fn2 | health_fn2 | health_lag_fn2 | pubhealth_fn2 | roads_fn2 | cerebvas_fn2 | assault_fn2 | external_fn2 | txp_pop_fn2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Informed | Australia | 10.63500 | 18317.923 | 0.2366284 | 22178.54 | 21779.43 | 1957.500 | 1848.214 | 5.676923 | 104.87573 | 557.6923 | 16.769231 | 393.0000 | 0.8751195 | 1.1428075 | 830.89394 | 0.0107334 | 3958.506 | 4085.883 | 481.6276 | 460.0962 | 0.4245661 | 14.327316 | 82.69863 | 1.8327505 | 26.76440 | 0.0396299 |
| Informed | Canada | 13.96667 | 29607.923 | 0.2969520 | 23711.08 | 23353.07 | 2271.929 | 2163.429 | 6.676923 | 109.26011 | 422.3846 | 16.769231 | 410.6154 | 1.0485954 | 0.7511607 | 1192.74791 | 0.0119626 | 3965.847 | 4038.868 | 420.5751 | 379.1317 | 0.3919053 | 17.679258 | 38.46544 | 2.4547181 | 40.76873 | 0.0424701 |
| Informed | Denmark | 13.09167 | 5257.154 | 12.2004034 | 23722.31 | 23275.00 | 2054.071 | 1973.429 | 6.984615 | 101.63635 | 640.6923 | 12.230769 | 532.3846 | 0.7610332 | 1.4681208 | 80.60691 | 0.1870664 | 3895.685 | 4100.016 | 371.3614 | 357.7605 | 0.1519109 | 12.421001 | 46.27163 | 2.1273554 | 33.60441 | 0.0116730 |
| Informed | Germany | 13.04167 | 80254.846 | 22.4784601 | 22163.23 | 21938.36 | 2348.750 | 2256.250 | 8.142308 | 112.78873 | 706.7692 | 9.538462 | 391.3077 | 0.5508479 | 0.6111960 | 5157.63285 | 1.4445937 | 2501.344 | 2546.250 | 377.2275 | 383.7725 | 0.6244485 | 25.911094 | 126.03515 | 1.6641006 | 56.97424 | 0.0437534 |
| Informed | Ireland | 19.79167 | 3673.615 | 5.2278574 | 20824.38 | 20153.64 | 1479.929 | 1340.786 | 4.876923 | 117.77424 | 704.6923 | 8.538462 | 394.0000 | 0.8175829 | 2.4784373 | 131.59378 | 0.1872688 | 6669.580 | 6881.862 | 565.5526 | 482.4379 | 0.2862221 | 10.761587 | 87.20320 | 1.5607362 | 18.85471 | 0.0287060 |
| Informed | Netherlands | 13.65833 | 15547.692 | 37.4372557 | 23013.15 | 22553.64 | 1992.786 | 1884.857 | 5.700000 | 76.09357 | 584.9231 | 11.153846 | 285.8462 | 0.7078763 | 1.5518074 | 372.96434 | 0.8980600 | 3769.961 | 4009.418 | 417.0621 | 377.0521 | 0.3082207 | 9.930020 | 52.23259 | 1.3445045 | 18.62725 | 0.0169759 |
| Informed | United Kingdom | 13.49167 | 58186.692 | 23.9540127 | 21359.31 | 20962.50 | 1561.214 | 1463.500 | 5.761539 | 67.92936 | 707.9231 | 8.923077 | 287.9231 | 0.7047037 | 0.7751344 | 626.34567 | 0.2578509 | 3929.497 | 4056.793 | 405.0679 | 374.9447 | 0.3548203 | 10.467402 | 93.43577 | 1.6563785 | 15.14079 | 0.0075729 |
| Informed | United States | 19.98167 | 269329.769 | 2.7970428 | 29211.77 | 28699.43 | 3988.286 | 3760.429 | 5.776923 | 155.16783 | 444.3846 | 80.384615 | 530.0000 | 1.0193884 | 1.3253667 | 12544.86916 | 0.1302809 | 4571.160 | 4791.979 | 864.9320 | 807.5220 | 0.4621577 | 8.353810 | 16.04960 | 17.8724111 | 32.15587 | 0.0476772 |
| Presumed | Austria | 23.52500 | 7927.308 | 9.4530261 | 23875.85 | 23415.07 | 1875.357 | 1803.143 | 5.492308 | 149.86541 | 768.8462 | 10.923077 | 506.8462 | 0.6308434 | 2.4159037 | 109.19507 | 0.1302112 | 3342.889 | 3645.228 | 296.8980 | 316.8549 | 0.2660249 | 30.281692 | 119.64242 | 2.3259958 | 62.13540 | 0.0088249 |
| Presumed | Belgium | 21.90000 | 10153.308 | 30.6746456 | 22499.62 | 22095.93 | 1958.357 | 1862.429 | 6.188889 | 154.69504 | 593.8462 | 14.307692 | 541.6154 | 0.7880048 | 1.9357874 | 109.16378 | 0.3297999 | 3170.584 | 3400.119 | 405.1142 | 403.2977 | 0.2027588 | 20.556129 | 55.24920 | 3.6602508 | 22.87116 | 0.0084838 |
| Presumed | Finland | 18.44167 | 5111.846 | 1.5117096 | 21018.92 | 20763.00 | 1615.286 | 1559.786 | 5.861538 | 93.57447 | 771.3846 | 27.461538 | 721.9231 | 0.5869704 | 1.5264089 | 68.62561 | 0.0202944 | 3667.866 | 3651.757 | 202.9780 | 181.5384 | 0.7274825 | 19.007381 | 136.47865 | 3.2304640 | 66.87234 | 0.0079285 |
| Presumed | France | 16.75833 | 58055.692 | 10.5268708 | 22602.85 | 22210.71 | 2159.643 | 2066.429 | 7.076923 | 156.15327 | 432.6923 | 8.923077 | 602.6923 | 0.7063585 | 1.5974174 | 851.44929 | 0.1543879 | 3260.346 | 3459.035 | 397.2170 | 371.9650 | 0.2241794 | 20.063260 | 54.53345 | 1.6563785 | 46.63758 | 0.0103484 |
| Presumed | Italy | 11.10000 | 57359.692 | 19.0348750 | 21554.15 | 21194.93 | 1757.000 | 1689.071 | 5.984615 | 121.94294 | 712.1538 | 14.923077 | 368.8462 | 0.4533029 | 4.2769998 | 424.68309 | 0.1409315 | 2781.309 | 2991.191 | 271.2379 | 264.0039 | 0.4469268 | 10.157891 | 118.03237 | 5.9646394 | 42.80936 | 0.0033601 |
| Presumed | Norway | 15.44167 | 4386.231 | 1.3542765 | 26448.38 | 25769.36 | 2217.214 | 2125.429 | 6.830769 | 69.99821 | 661.6154 | 9.538462 | 423.3077 | 0.2280896 | 1.1090195 | 97.25752 | 0.0300289 | 6491.668 | 6734.625 | 606.2047 | 663.5244 | 0.3065524 | 6.676658 | 100.37890 | 2.4703368 | 43.53999 | 0.0050521 |
| Presumed | Spain | 28.10833 | 39666.231 | 7.8393310 | 16933.00 | 16584.29 | 1289.071 | 1220.071 | 5.453846 | 161.11430 | 654.7692 | 8.692308 | 376.6154 | 0.7062535 | 4.9630376 | 950.90309 | 0.1879292 | 2888.343 | 3066.466 | 265.8960 | 269.3863 | 0.1450022 | 35.251103 | 138.65013 | 0.9473309 | 35.16281 | 0.0164238 |
| Presumed | Sweden | 13.12500 | 8789.231 | 1.9533360 | 22415.46 | 22094.00 | 1951.357 | 1868.000 | 7.315385 | 72.34575 | 595.3077 | 11.153846 | 395.8462 | 0.6827600 | 1.7535030 | 113.62376 | 0.0252520 | 3213.468 | 3313.422 | 372.9790 | 329.3088 | 0.2303843 | 13.246920 | 49.68465 | 1.6756170 | 37.58733 | 0.0089211 |
| Presumed | Switzerland | 14.18250 | 7036.846 | 17.0424949 | 27233.00 | 26931.29 | 2776.071 | 2655.643 | 5.423077 | 96.38543 | 423.5385 | 10.769231 | 488.2308 | 0.9953044 | 1.7090940 | 169.77330 | 0.4111729 | 2153.454 | 2356.923 | 475.6701 | 464.0117 | 0.6043645 | 21.701876 | 72.99956 | 3.5155333 | 96.19958 | 0.0242808 |
# Cleveland dotplot reordered and colored
p <- ggplot(
  by_country,
  aes(x = donors_fn1, y = reorder(country, donors_fn1), color = consent_law)
)
p +
  geom_point(size = 3) +
  labs(x = "Donor Procurement Rate", y = "", color = "Consent Law") +
  theme(legend.position = "top")

# Cleveland dotplot reordered, colored and faceted
p <- ggplot(by_country, aes(x = donors_fn1, y = reorder(country, donors_fn1)))
p +
  geom_point(size = 3) +
  # col arg to make panels appear on top of other and make y-scale free; where
  # one axis is categorical, as here, we can free the categorical axis and
  # leave the continuous one fixed
  facet_wrap(~ consent_law, scales = "free_y", ncol = 1) +
  labs(x = "Donor Procurement Rate", y = "")

# Dot-and-whisker plot
p <- ggplot(by_country, aes(x = reorder(country, donors_fn1), y = donors_fn1))
p +
  # shows us a point estimate and a range around it
  geom_pointrange(
    mapping = aes(
      ymin = donors_fn1 - donors_fn2,
      ymax = donors_fn1 + donors_fn2
    )
  ) +
  labs(x = "", y = "Donor Procurement Rate") +
  coord_flip()
The ggrepel package provides geom_text_repel() and geom_label_repel(), two geoms that can pick out labels much more flexibly than the default geom_text(). The ggrepel package has several other useful geoms and options to aid with effectively plotting labels along with points. The performance of its labeling algorithm is consistently very good. For many purposes it will be a better first choice than geom_text().
# Columns 2 to 7 of the historic presidential election results.
elections_historic %>% select(2:7)
| year | winner | win_party | ec_pct | popular_pct | popular_margin |
|---|---|---|---|---|---|
| 1824 | John Quincy Adams | D.-R. | 0.3218 | 0.3092 | -0.1044 |
| 1828 | Andrew Jackson | Dem. | 0.6820 | 0.5593 | 0.1225 |
| 1832 | Andrew Jackson | Dem. | 0.7657 | 0.5474 | 0.1781 |
| 1836 | Martin Van Buren | Dem. | 0.5782 | 0.5079 | 0.1420 |
| 1840 | William Henry Harrison | Whig | 0.7959 | 0.5287 | 0.0605 |
| 1844 | James Polk | Dem. | 0.6182 | 0.4954 | 0.0145 |
| 1848 | Zachary Taylor | Whig | 0.5621 | 0.4728 | 0.0479 |
| 1852 | Franklin Pierce | Dem. | 0.8581 | 0.5083 | 0.0695 |
| 1856 | James Buchanan | Dem. | 0.5878 | 0.4529 | 0.1220 |
| 1860 | Abraham Lincoln | Rep. | 0.5941 | 0.3965 | 0.1013 |
| 1864 | Abraham Lincoln | Rep. | 0.9099 | 0.5503 | 0.1008 |
| 1868 | Ulysses Grant | Rep. | 0.7279 | 0.5266 | 0.0532 |
| 1872 | Ulysses Grant | Rep. | 0.8125 | 0.5558 | 0.1180 |
| 1876 | Rutherford Hayes | Rep. | 0.5014 | 0.4792 | -0.0300 |
| 1880 | James Garfield | Rep. | 0.5799 | 0.4831 | 0.0009 |
| 1884 | Grover Cleveland | Dem. | 0.5461 | 0.4885 | 0.0057 |
| 1888 | Benjamin Harrison | Rep. | 0.5810 | 0.4780 | -0.0830 |
| 1892 | Grover Cleveland | Dem. | 0.6239 | 0.4602 | 0.0301 |
| 1896 | William McKinley | Rep. | 0.6063 | 0.5102 | 0.0431 |
| 1900 | William McKinley | Rep. | 0.6523 | 0.5164 | 0.0612 |
| 1904 | Theodore Roosevelt | Rep. | 0.7059 | 0.5642 | 0.1883 |
| 1908 | William Taft | Rep. | 0.6646 | 0.5157 | 0.0853 |
| 1912 | Woodrow Wilson | Dem. | 0.8192 | 0.4184 | 0.1444 |
| 1916 | Woodrow Wilson | Dem. | 0.5217 | 0.4924 | 0.0312 |
| 1920 | Warren Harding | Rep. | 0.7608 | 0.6032 | 0.2617 |
| 1924 | Calvin Coolidge | Rep. | 0.7194 | 0.5404 | 0.2522 |
| 1928 | Herbert Hoover | Rep. | 0.8362 | 0.5821 | 0.1741 |
| 1932 | Franklin Roosevelt | Dem. | 0.8889 | 0.5741 | 0.1776 |
| 1936 | Franklin Roosevelt | Dem. | 0.9849 | 0.6080 | 0.2426 |
| 1940 | Franklin Roosevelt | Dem. | 0.8456 | 0.5474 | 0.0996 |
| 1944 | Franklin Roosevelt | Dem. | 0.8136 | 0.5339 | 0.0750 |
| 1948 | Harry Truman | Dem. | 0.5706 | 0.4955 | 0.0448 |
| 1952 | Dwight Eisenhower | Rep. | 0.8324 | 0.5518 | 0.1085 |
| 1956 | Dwight Eisenhower | Rep. | 0.8606 | 0.5737 | 0.1540 |
| 1960 | John Kennedy | Dem. | 0.5642 | 0.4972 | 0.0017 |
| 1964 | Lyndon Johnson | Dem. | 0.9033 | 0.6105 | 0.2258 |
| 1968 | Richard Nixon | Rep. | 0.5595 | 0.4342 | 0.0070 |
| 1972 | Richard Nixon | Rep. | 0.9665 | 0.6067 | 0.2315 |
| 1976 | Jimmy Carter | Dem. | 0.5520 | 0.5008 | 0.0206 |
| 1980 | Ronald Reagan | Rep. | 0.9089 | 0.5075 | 0.0974 |
| 1984 | Ronald Reagan | Rep. | 0.9758 | 0.5877 | 0.1821 |
| 1988 | George H. W. Bush | Rep. | 0.7918 | 0.5337 | 0.0772 |
| 1992 | Bill Clinton | Dem. | 0.6877 | 0.4301 | 0.0556 |
| 1996 | Bill Clinton | Dem. | 0.7045 | 0.4923 | 0.0851 |
| 2000 | George W. Bush | Rep. | 0.5037 | 0.4787 | -0.0510 |
| 2004 | George W. Bush | Rep. | 0.5316 | 0.5073 | 0.0246 |
| 2008 | Barack Obama | Dem. | 0.6784 | 0.5293 | 0.0727 |
| 2012 | Barack Obama | Dem. | 0.6171 | 0.5106 | 0.0386 |
| 2016 | Donald Trump | Rep. | 0.5687 | 0.4625 | -0.0175 |
# Titles and axis labels, kept in variables to keep the plotting call short.
p_title <- "Presidential Elections: Popular & Electoral College Margins"
p_subtitle <- "1824-2016"
p_caption <- "Data for 2016 are provisional."
x_label <- "Winner's share of Popular Vote"
y_label <- "Winner's share of Electoral College Votes"
# Winner's popular-vote share against Electoral College share, every election
# labelled with winner_label.
p <- ggplot(
  data = elections_historic,
  mapping = aes(x = popular_pct, y = ec_pct, label = winner_label)
)
p +
  # two new geoms, geom_hline() and geom_vline() to make the lines. see also
  # geom_abline() geom that draws straight lines based on a supplied slope and
  # intercept
  geom_hline(yintercept = 0.5, size = 1.4, color = "gray80") +
  geom_vline(xintercept = 0.5, size = 1.4, color = "gray80") +
  geom_point() +
  geom_text_repel() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = x_label,
    y = y_label,
    title = p_title,
    subtitle = p_subtitle,
    caption = p_caption
  )
Sometimes we want to pick out some points of interest in the data without labeling every single item. Alternatively, we can pick out specific points by creating a dummy variable in the data set just for this purpose.
# Label only selected points by handing geom_text_repel() a subset of the
# data, while geom_point() still draws every row.
p <- ggplot(data = by_country,
            mapping = aes(x = gdp_fn1, y = health_fn1))

# Using subset to filter the data: label rows with gdp_fn1 above 25000
p + geom_point() +
  geom_text_repel(data = subset(by_country, gdp_fn1 > 25000),
                  mapping = aes(label = country))

# Second version (a separate statement; it was fused onto the previous line):
# also label rows with health_fn1 below 1500, and Belgium.
p <- ggplot(data = by_country,
            mapping = aes(x = gdp_fn1, y = health_fn1))
p + geom_point() +
  geom_text_repel(data = subset(by_country,
                                gdp_fn1 > 25000 | health_fn1 < 1500 |
                                  country %in% "Belgium"),
                  mapping = aes(label = country))
# Creating a dummy variable to subset the data
# Flag the rows to highlight: ccode "Ita" or "Spa" with year > 1998.
organdata$ind <- organdata$ccode %in% c("Ita", "Spa") &
  organdata$year > 1998

p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors, color = ind))

# Color every point by the flag, label only the flagged rows, and suppress the
# guides. "none" is used instead of FALSE because ggplot2 deprecated FALSE as
# a way of switching a guide off (from version 3.3.4).
p + geom_point() +
  geom_text_repel(data = subset(organdata, ind),
                  mapping = aes(label = ccode)) +
  guides(label = "none", color = "none")
Sometimes we want to annotate the figure directly. We use annotate() for this purpose. We will tell annotate() to use a text geom temporarily taking advantage of their features in order to place something on the plot. The annotate() function can work with other geoms, too. The most obvious use-case is putting arbitrary text on the plot.
# Scatterplot of roads against donors with a free-standing text note.
p <- ggplot(data = organdata, mapping = aes(x = roads, y = donors))

# annotate() borrows the text geom to place a label at fixed plot coordinates
# (x = 91, y = 33) rather than at positions taken from the data.
p +
  geom_point() +
  annotate(
    geom = "text",
    x = 91,
    y = 33,
    label = "A surprisingly high \n recovery rate.",
    hjust = 0
  )
Learning about new geoms extended what we have seen already. Each geom makes a different type of plot. Different plots require different mappings in order to work, and so each geom_ function takes mappings tailored to the kind of graph it draws. You can’t use geom_point() to make a scatterplot without supplying an x and a y mapping, for example. Using geom_histogram() only requires you to supply an x mapping. Similarly, geom_pointrange() requires ymin and ymax mappings in order to know where to draw the lineranges it makes. A geom_ function may take optional arguments, too. When using geom_boxplot() you can specify what the outliers look like using arguments like outlier.shape and outlier.color.
Now we’ll make use of new functions for controlling some aspects of the appearance of our graph.
Consistent with ggplot’s overall approach, adjusting some visible feature of the graph means first thinking about the relationship that the feature has with the underlying data. Roughly speaking, if the change you want to make will affect the substantive interpretation of any particular geom, then most likely you will either be mapping an aesthetic to a variable using that geom’s aes() function, or you will be specifying a change via some scale_ function. If the change you want to make does not affect the interpretation of a given geom_, then most likely you will either be setting a variable inside the geom_ function, or making a cosmetic change via the theme() function.
# Three aesthetic mappings: roads to x, donors to y, world to color.
p <- ggplot(
  data = organdata,
  mapping = aes(x = roads, y = donors, color = world)
)
p + geom_point()
Scales and guides are closely connected, which can make things confusing. The guide provides information about the scale, such as in a legend or colorbar. Thus, it is possible to make adjustments to guides from inside the various scale_ functions. More often it is easier to use the guides() function directly.
A plot with three aesthetic mappings. The variable roads is mapped to x; donors is mapped to y; and world is mapped to color. The x and y scales are both continuous, running smoothly from just under the lowest value of the variable to just over the highest value. Various labeled tick marks orient the reader to the values on each axis. The color mapping also has a scale. The world measure is an unordered categorical variable, so its scale is discrete. It takes one of four values, each represented by a different color.
Along with color, mappings like fill, shape, and size will have scales that we might want to customize or adjust. We could have mapped world to shape instead of color. In that case our four-category variable would have a scale consisting of four different shapes. Scales for these mappings may have labels, axis tick marks at particular positions, or specific colors or shapes. If we want to adjust them, we use one of the scale_ functions.
Many different kinds of variable can be mapped. More often than not x and y are continuous measures. But they might also easily be discrete, as when we mapped country names to the y axis in our boxplots and dotplots. An x or y mapping can also be defined as a transformation onto a log scale, or as a special sort of number value like a date. Similarly, a color or a fill mapping can be discrete and unordered, as with our world variable, or discrete and ordered, as with letter grades in an exam. A color or fill mapping can also be a continuous quantity, represented as a gradient running smoothly from a low to a high value. Finally, both continuous gradients and ordered discrete values might have some defined neutral midpoint with extremes diverging in both directions.
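Because we have several potential mappings, and each mapping might be to one of several different scales, we end up with a lot of individual scale_ functions. Each deals with one combination of mapping and scale. They are named according to a consistent logic: scale_<MAPPING>_<KIND>(). For example, scale_x_continuous() controls a continuous x mapping, and scale_color_discrete() controls a discrete color mapping.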
If you want to adjust the labels or tick marks on a scale, you will need to know which mapping it is for and what sort of scale it is. Then you supply the arguments to the appropriate scale function. For example, we can change the x-axis of the previous plot to a log scale, and then also change the position and labels of the tick marks on the y-axis.
# Same mappings as before; the scales are adjusted in the layers below.
p <- ggplot(
  data = organdata,
  mapping = aes(x = roads, y = donors, color = world)
)

# Log-transform the x-axis, and put the y-axis breaks at 5, 15 and 25 with
# spelled-out labels.
p +
  geom_point() +
  scale_x_log10() +
  scale_y_continuous(
    breaks = c(5, 15, 25),
    labels = c("Five", "Fifteen", "Twenty Five")
  )
The same applies to mappings like color and fill. Here the available scale_ functions include ones that deal with continuous, diverging, and discrete variables, as well as others that we will encounter later when we discuss the use of color and color palettes in more detail. When working with a scale that produces a legend, we can also use its scale_ function to specify the labels in the key. To change the title of the legend, however, we use the labs() function, which lets us label all the mappings.
p <- ggplot(
  data = organdata,
  mapping = aes(x = roads, y = donors, color = world)
)

# The key labels come from the color scale; the axis titles and the legend
# title come from labs().
p +
  geom_point() +
  scale_color_discrete(
    labels = c("Corporatist", "Liberal", "Social Democratic", "Unclassified")
  ) +
  labs(
    x = "Road Deaths",
    y = "Donor Procurement",
    color = "Welfare State"
  )
If we want to move the legend somewhere else on the plot, we are making a purely cosmetic decision and that is the job of the theme() function. As we have already seen, adding + theme(legend.position = “top”) will move the legend as instructed. Finally, to make the legend disappear altogether, we tell ggplot that we do not want a guide for that scale.
We will use scale_ functions fairly regularly to make small adjustments to the labels and axes of our graphs. And we will occasionally use the theme() function to make some cosmetic adjustments.
# Same scatterplot, with the color legend removed altogether.
p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors,
                          color = world))

# "none" is used instead of FALSE because ggplot2 deprecated FALSE as a way of
# switching a guide off (from version 3.3.4).
p + geom_point() +
  labs(x = "Road Deaths",
       y = "Donor Procurement") +
  guides(color = "none")
# Progressive enhancements of the same plot
# v1: scatterplot of the 2014 rows with the default smoother
p <- ggplot(
  data = subset(asasec, Year == 2014),
  mapping = aes(x = Members, y = Revenues, label = Sname)
)
p + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# v2: color the points by Journal and switch to a linear fit
p <- ggplot(
  data = subset(asasec, Year == 2014),
  mapping = aes(x = Members, y = Revenues, label = Sname)
)
p +
  geom_point(mapping = aes(color = Journal)) +
  geom_smooth(method = "lm")

# v3: draw the fit first, in gray and without its standard-error band, then
# the points
p0 <- ggplot(
  data = subset(asasec, Year == 2014),
  mapping = aes(x = Members, y = Revenues, label = Sname)
)
p1 <- p0 +
  geom_smooth(method = "lm", se = FALSE, color = "gray80") +
  geom_point(mapping = aes(color = Journal))

# v4: label only the 2014 rows with Revenues above 7000
p2 <- p1 +
  geom_text_repel(
    data = subset(asasec, Year == 2014 & Revenues > 7000),
    size = 2
  )
p2

# v5: titles, dollar-formatted y-axis, legend at the bottom
p3 <- p2 +
  labs(
    x = "Membership",
    y = "Revenues",
    color = "Section has own Journal",
    title = "ASA Sections",
    subtitle = "2014 Calendar year.",
    caption = "Source: ASA annual report."
  )
p4 <- p3 +
  scale_y_continuous(labels = scales::dollar) +
  theme(legend.position = "bottom")
p4
You should choose a color palette in the first place based on its ability to express the data you are plotting. Take care to choose a palette that reflects the structure of your data. Separate from these mapping issues, there are considerations about which colors in particular to choose. In general, the default color palettes that ggplot makes available are well-chosen for their perceptual properties and aesthetic qualities. We can also use color and color layers as a device for emphasis, to highlight particular data points or parts of the plot, perhaps in conjunction with other features.
We choose color palettes for mappings through one of the scale_ functions for color or fill. While it is possible to very finely control the look of your color schemes by varying the hue, chroma, and luminance of each color you use via scale_color_hue(), or scale_fill_hue(), in general this is not recommended. Instead you should use the RColorBrewer package to make a wide range of named color palettes available to you. When used in conjunction with ggplot, you access these colors by specifying the scale_color_brewer() or scale_fill_brewer() functions, depending on the aesthetic you are mapping.
You can also specify colors manually, via scale_color_manual() or scale_fill_manual(). These functions take a values argument that can be specified as a vector of color names or color values that R knows about. The ability to manually specify colors can be useful when the meaning of a category itself has a strong color association. R knows many color names (like red, green, and cornflowerblue). Try demo(‘colors’) for an overview. Alternatively, color values can be specified via their hexadecimal RGB value. This is a way of encoding color values in the RGB colorspace, where each channel can take a value from 0 to 255. A color hex value begins with a hash or pound character, #, followed by three pairs of hexadecimal or “hex” numbers. Hex values are in Base 16, with the first six letters of the alphabet standing for the numbers 10 to 15. This allows a two-character hex number to range from 0 to 255. You read them as #rrggbb, where rr is the two-digit hex code for the red channel, gg for the green channel, and bb for the blue channel. So #CC55DD translates in decimal to CC = 204 (red), 55 = 85 (green), and DD = 221 (blue). It gives a strong pink color.
If we are serious about using a safe palette for color-blind viewers, we should investigate the dichromat package (The colorblindr package has similar functionality) instead. It provides a range of safe palettes and some useful functions for helping you approximately see what your current palette might look like to a viewer with one of several different kinds of color blindness.
# The same scatterplot drawn with three RColorBrewer palettes. Each plot is
# its own statement (they were fused onto shared lines).
p <- ggplot(data = organdata,
            mapping = aes(x = roads, y = donors, color = world))

p + geom_point(size = 2) + scale_color_brewer(palette = "Set2") +
  theme(legend.position = "top")

p + geom_point(size = 2) + scale_color_brewer(palette = "Pastel2") +
  theme(legend.position = "top")

p + geom_point(size = 2) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "top")

# Defining your own palette
cb_palette <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
                "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# p4 is the plot object built in the earlier "progressive enhancements" chunk.
p4 + scale_color_manual(values = cb_palette)
# Setting default color palette
# Five colors from the "Set2" palette serve as the reference column.
Default <- brewer.pal(5, "Set2")

# safety colors from dichromat: display name -> dichromat() type code
types <- c(
  Deuteronopia = "deutan",
  Protanopia = "protan",
  Tritanopia = "tritan"
)

# One column per type, holding the Default colors passed through dichromat(),
# with the Default colors added as the first column.
color_table <- types %>%
  purrr::map(\(type) dichromat(Default, type)) %>%
  as_tibble() %>%
  add_column(Default, .before = TRUE)
color_table
| Default | Deuteronopia | Protanopia | Tritanopia |
|---|---|---|---|
| #66C2A5 | #AEAEA7 | #BABAA5 | #82BDBD |
| #FC8D62 | #B6B661 | #9E9E63 | #F29494 |
| #8DA0CB | #9C9CCB | #9E9ECB | #92ABAB |
| #E78AC3 | #ACACC1 | #9898C3 | #DA9C9C |
| #A6D854 | #CACA5E | #D3D355 | #B6C8C8 |
color_comp(color_table)
Aside from mapping variables directly, color is also very useful when we want to pick out or highlight some aspect of our data. It is in cases like this that the layered approach of ggplot can really work to our advantage.
We will build up a plot of data about the 2016 US general election. It is contained in the county_data object in the socviz library. We begin by defining a blue and red color for the Democrats and Republicans. Then we create the basic setup and first layer of the plot. We subset the data, including only counties with a value of “No” on the flipped variable. We set the color of geom_point() to be a light gray, as it will form the background layer of the plot. And we apply a log transformation to the x-axis scale.
In the next step we add a second geom_point() layer. Here we start with the same dataset but extract a complementary subset from it. This time we choose the “Yes” counties on the flipped variable. The x and y mappings are the same, but we add a color scale for these points, mapping the partywinner16 variable to the color aesthetic. Then we specify a manual color scale with scale_color_manual(), where the values are the blue and red party_colors we defined above.
The next layer sets the y-axis scale and the labels.
Finally, we add a third layer using the geom_text_repel() function. Once again we supply a set of instructions to subset the data for this text layer. We are interested in the flipped counties that have a relatively high percentage of African-American residents.
# Democrat Blue and Republican Red
party_colors <- c("#2E74C0", "#CB454A")

# Layer 1: counties with flipped == "No" as a gray background, log x-axis.
p0 <- ggplot(data = subset(county_data,
                           flipped == "No"),
             mapping = aes(x = pop,
                           y = black/100))
p1 <- p0 + geom_point(alpha = 0.15, color = "gray50") +
  scale_x_log10(labels = scales::comma)
p1

# Layer 2: counties with flipped == "Yes", colored by partywinner16.
# Printing p1 and assigning p2 are separate statements; fused as `p1p2 <-`
# they assigned to a stray name and left p2 undefined. The same applies to
# p3 and p4 below.
p2 <- p1 + geom_point(data = subset(county_data,
                                    flipped == "Yes"),
                      mapping = aes(x = pop, y = black/100,
                                    color = partywinner16)) +
  scale_color_manual(values = party_colors)
p2

# Percent y-axis and the labels.
p3 <- p2 + scale_y_continuous(labels = scales::percent) +
  labs(color = "County flipped to ... ",
       x = "County Population (log scale)",
       y = "Percent Black Population",
       title = "Flipped counties, 2016",
       caption = "Counties in gray did not flip.")
p3

# Layer 3: state labels for flipped counties with black > 25.
p4 <- p3 + geom_text_repel(data = subset(county_data,
                                         flipped == "Yes" &
                                           black > 25),
                           mapping = aes(x = pop,
                                         y = black/100,
                                         label = state), size = 2)
p4 + theme_minimal() +
  theme(legend.position = "top")
If we want to change the overall look of it all at once, we can do that using ggplot’s theme engine. Themes can be turned on or off using the theme_set() function. It takes the name of a theme (which will itself be a function) as an argument.
Internally, theme functions are a set of detailed instructions to turn on, turn off, or modify a large number of graphical elements on the plot. Once set, a theme applies to all subsequent plots and it remains active until it is replaced by a different theme. This can be done either through the use of another theme_set() statement, or on a per-plot basis by adding the theme function to the end of the plot: p4 + theme_gray() would temporarily override the generally active theme for the p4 object only. You can still use the theme() function to fine-tune any aspect of your plot, as seen above with the relocation of the legend to the top of the graph.
The ggplot library comes with several built-in themes, including theme_minimal() and theme_classic(), with theme_gray() or theme_grey() as the default. If these are not to your taste, install the ggthemes library for many more options.
You can define your own themes either entirely from scratch, or by starting with one you like and making adjustments from there.
Wilke’s cowplot package, for instance, contains a well-developed theme suitable for figures whose final destination is a journal article. Bob Rudis’s hrbrthemes package, meanwhile, has a distinctive and compact look and feel that takes advantage of some freely-available typefaces.
The theme() function allows you to exert very fine-grained control over the appearance of all kinds of text and graphical elements in a plot.
# Alternative global themes, left disabled:
# theme_set(theme_bw())
# p4 + theme(legend.position="top")
#
# theme_set(theme_dark())
# p4 + theme(legend.position="top")
#
# theme_set(theme_economist())
# p4 + theme(legend.position="top")
# theme_set(theme_wsj())

# Three separate plots follow (they were fused onto one line).
# 1. Shrink the title, legend title and caption relative to the theme's sizes.
p4 + theme(plot.title = element_text(size = rel(0.6)),
           legend.title = element_text(size = rel(0.35)),
           plot.caption = element_text(size = rel(0.35)),
           legend.position = "top")

# 2. Only move the legend.
p4 + theme(legend.position = "top")

# 3. Restyle the title and the x-axis tick labels through element_text().
p4 + theme(legend.position = "top",
           plot.title = element_text(size = rel(2),
                                     lineheight = .5,
                                     family = "Times",
                                     face = "bold.italic",
                                     colour = "orange"),
           axis.text.x = element_text(size = rel(1.1),
                                      family = "Courier",
                                      face = "bold",
                                      color = "purple"))
### Use Theme Elements in a Substantive Way
The gss_lon data contains information on the age of each GSS respondent for all the years in the survey since 1972. We will fill the density curves with a dark grey color, and then add an indicator of the mean age in each year, and a text layer for the label. With those in place we then adjust the detail of several theme elements, mostly to remove them. As before we use element_text() to tweak the appearance of various text elements such as titles and labels. And we also use element_blank() to remove several of them altogether. First, we need to calculate the mean age of the respondents for each year of interest. Because the GSS has been around for most (but not all) years since 1972, we will look at distributions about every four years since the beginning.
The initial p object subsets the data by the years we have chosen, and maps x to the age variable. The geom_density() call is the base layer, with arguments to turn off its default line color, set the fill to a shade of gray, and scale the y-axis between zero and one. Using our summarized dataset, the geom_vline() layer draws a vertical white line at the mean age of the distribution.
The ggridges package offers a different take on small-multiple density plots by allowing the distributions to overlap vertically to interesting effect. It is especially useful for repeated distributional measures that change in a clear direction. The expand argument in scale_y_discrete() adjusts the scaling of the y-axis slightly. The package also comes with its own theme, theme_ridges() that adjusts the labels so that they are aligned properly. The degree of overlap in the distributions is controlled by the scale argument in the geom.
Setting these thematic elements in an ad hoc way is often one of the first things people want to do when they make a plot. But making small adjustments to theme elements should be the very last thing you do in the plotting process. Ideally, once you have set up a theme that works well for you, it should be something you can avoid having to do at all.
# Survey years to plot: every fourth year, with 1993 between 1988 and 1996.
yrs <- c(seq(1972, 1988, 4), 1993, seq(1996, 2016, 4))
yrs
## [1] 1972 1976 1980 1984 1988 1993 1996 2000 2004 2008 2012 2016

# Mean respondent age, rounded to a whole number, for each year in yrs.
# The row filter must be vectorized: filter() combines comma-separated
# conditions element-wise. The previous scalar `&&` only looked at the first
# element of each side, so no rows were dropped and every survey year ended up
# in the summary (in R >= 4.3 `&&` on longer vectors is an error).
mean_age <- gss_lon %>%
  filter(!is.na(age), year %in% yrs) %>%
  group_by(year) %>%
  summarize(xbar = round(mean(age, na.rm = TRUE), 0))
mean_age
| year | xbar |
|---|---|
| 1972 | 45 |
| 1976 | 45 |
| 1980 | 45 |
| 1984 | 44 |
| 1988 | 45 |
| 1993 | 46 |
| 1996 | 45 |
| 2000 | 46 |
| 2004 | 46 |
| 2008 | 48 |
| 2012 | 48 |
| 2016 | 49 |
# Fixed positions for the two text layers: the mean-age label sits at y = 0.3,
# the year label at x = 85, y = 0.8.
mean_age$y <- 0.3
yr_labs <- data.frame(x = 85, y = 0.8,
                      year = yrs)

# First, we create the plot structure
p <- ggplot(data = subset(gss_lon, year %in% yrs),
            mapping = aes(x = age))

# Layers: density curve, white line at the mean age, mean-age label, year
# label; one facet row per year.
# - color = NA is the documented way to draw no outline (FALSE is not a color
#   value).
# - after_stat(scaled) replaces the older `..scaled..` notation for the
#   computed variable; it requires ggplot2 >= 3.3.0.
p1 <- p +
  geom_density(fill = "gray20", color = NA,
               alpha = 0.9, mapping = aes(y = after_stat(scaled))) +
  geom_vline(data = subset(mean_age, year %in% yrs),
             aes(xintercept = xbar), color = "white", size = 0.5) +
  geom_text(data = subset(mean_age, year %in% yrs),
            aes(x = xbar, y = y, label = xbar), nudge_x = 7.5,
            color = "white", size = 3.5, hjust = 1) +
  geom_text(data = subset(yr_labs, year %in% yrs),
            aes(x = x, y = y, label = year)) +
  facet_grid(year ~ ., switch = "y")
# With the structure of the plot in place, we then style the elements in the way that we want, using a series of instructions to theme().
# p1 + theme_book(base_size = 10, plot_title_size = 10,
# strip_text_size = 32, panel_spacing = unit(0.1, "lines")) +
# theme(plot.title = element_text(size = 16),
# axis.text.x= element_text(size = 12),
# axis.title.y=element_blank(),
# axis.text.y=element_blank(),
# axis.ticks.y = element_blank(),
# strip.background = element_blank(),
# strip.text.y = element_blank(),
# panel.grid.major = element_blank(),
# panel.grid.minor = element_blank()) +
# labs(x = "Age",
# y = NULL,
# title = "Age Distribution of\nGSS Respondents")
# Using the ggridges package
# year becomes an ordered factor whose levels are the reversed order of first
# appearance in the data, so each year gets its own ridge on the y-axis.
p <- ggplot(
  data = gss_lon,
  mapping = aes(
    x = age,
    y = factor(year, levels = rev(unique(year)), ordered = TRUE)
  )
)

p +
  geom_density_ridges(alpha = 0.6, fill = "lightblue", scale = 1.5) +
  scale_x_continuous(breaks = c(25, 50, 75)) +
  scale_y_discrete(expand = c(0.01, 0)) +
  labs(
    x = "Age",
    y = NULL,
    title = "Age Distribution of\nGSS Respondents"
  ) +
  theme_ridges() +
  theme(title = element_text(size = 16, face = "bold"))
### Case Studies
#### Two y-axes
R makes it slightly tricky to draw graphs with two y-axes. In fact, ggplot rules it out of order altogether. It is possible to do it using R’s base graphics. Most of the time when people draw plots with two y-axes they want to line the series up as closely as possible because they suspect that there’s a substantive association between them. The main problem with using two y-axes is that it makes it even easier than usual to fool yourself (or someone else) about the degree of association between the variables. This is because you can adjust the scaling of the axes relative to one another in a way that moves the data series around more or less however you like.
We could use a split- or broken-axis plot to show the two series at the same time. These can be effective sometimes, and they seem to have better perceptual properties than overlayed charts with dual axes. Another compromise, if the series are not in the same units (or of widely differing magnitudes), is to rescale one of the series (e.g., by dividing or multiplying it by a thousand), or alternatively to index each of them to 100 at the start of the first period, and then plot them both. Index numbers can have complications of their own, but here they allow us to use one axis instead of two, and also to calculate a sensible difference between the two series and plot that as well.
Now we have our two plots, we want to lay them out nicely. We do not want them to appear in the same plot area, but we do want to compare them. It would be possible to do this with a facet, but that would mean doing a fair amount of data munging to get all three series (the two indices and the difference between them) into the same tidy data frame. An alternative is to make two separate plots and then arrange them just as we like. The cowplot library makes things easy. It has a plot_grid() function that works much like grid.arrange() while also taking care of some fine details, including the proper alignment of axes across separate plot objects.
The broader problem with dual-axis plots of this sort is that the apparent association between these variables is probably spurious. The original plot is enabling our desire to spot patterns, but substantively it is probably the case that both of these time series are tending to increase, but are not otherwise related in any deep way. The use of dual axes is not recommended in general because it is already much too easy to present spurious, or at least overconfident, associations, especially with time series data. Scatterplots can do that just fine. Even with a single series, we can make associations look steeper or flatter by fiddling with the aspect ratio. Using two y-axes gives you an extra degree of freedom to mess about with the data.
# Tidying data
head(fredts)| date | sp500 | monbase | sp500_i | monbase_i |
|---|---|---|---|---|
| 2009-03-11 | 696.68 | 1542228 | 100.0000 | 100.0000 |
| 2009-03-18 | 766.73 | 1693133 | 110.0548 | 109.7849 |
| 2009-03-25 | 799.10 | 1693133 | 114.7012 | 109.7849 |
| 2009-04-01 | 809.06 | 1733017 | 116.1308 | 112.3710 |
| 2009-04-08 | 830.61 | 1733017 | 119.2240 | 112.3710 |
| 2009-04-15 | 852.21 | 1789878 | 122.3245 | 116.0579 |
# Keep the date and the two indexed series, then stack the two index columns
# into long form: `series` holds the former column name, `score` its value.
fredts_m <- fredts %>%
  select(date, sp500_i, monbase_i) %>%
  gather(key = series, value = score, sp500_i:monbase_i)
head(fredts_m)| date | series | score |
|---|---|---|
| 2009-03-11 | sp500_i | 100.0000 |
| 2009-03-18 | sp500_i | 110.0548 |
| 2009-03-25 | sp500_i | 114.7012 |
| 2009-04-01 | sp500_i | 116.1308 |
| 2009-04-08 | sp500_i | 119.2240 |
| 2009-04-15 | sp500_i | 122.3245 |
# Plotting
# Top panel: both indexed series as lines, colored by series.
p <- ggplot(
  data = fredts_m,
  mapping = aes(x = date, y = score, group = series, color = series)
)
p1 <- p +
  geom_line() +
  theme(legend.position = "top") +
  labs(x = "Date", y = "Index", color = "Series")

# Bottom panel: the difference between the two indices over time.
p <- ggplot(
  data = fredts,
  mapping = aes(x = date, y = sp500_i - monbase_i)
)
p2 <- p +
  geom_line() +
  labs(x = "Date", y = "Difference")

# arrange the plots
cowplot::plot_grid(
  p1, p2,
  nrow = 2,
  rel_heights = c(0.75, 0.25),
  align = "v"
)
#### Redrawing a bad slide
To redraw the chart I took the numbers from the bars on the chart together with employee data from QZ.com. Where there was quarterly data in the slide, I used the end-of-year number for employees, except for 2012. Mayer was appointed in July of 2012. Ideally we would have quarterly revenue and quarterly employee data for all years, but given that we do not, the most sensible thing to do is to keep things annualized except for the one year of interest, when Mayer arrives as CEO. It’s worth doing this because otherwise the large round of layoffs that immediately preceded her arrival would be misattributed to her tenure as CEO. The redrawing is straightforward. We could just draw a scatterplot and color the points by whether Mayer was CEO at the time. We can take a small step further by making a scatterplot but also holding on to the temporal element. We can use geom_path() and use line segments to “join the dots” of the yearly observations in order, labeling each point with its year.
Alternatively, we can keep the analyst community happy by putting time back on the x-axis and plotting the ratio of revenue to employees on the y-axis.
headTail(yahoo)| Year | Revenue | Employees | Mayer |
|---|---|---|---|
| 2004 | 3574 | 7600 | No |
| 2005 | 5257 | 9800 | No |
| 2006 | 6425 | 11400 | No |
| 2007 | 6969 | 14300 | No |
| … | … | … | NA |
| 2012 | 4986 | 12000 | No |
| 2012 | 4986 | 11500 | Yes |
| 2013 | 4680 | 12200 | Yes |
| 2014 | 4618 | 12500 | Yes |
# Employees against Revenue, with the observations joined in data order.
p <- ggplot(
  data = yahoo,
  mapping = aes(x = Employees, y = Revenue)
)

# Each observation is drawn as its Year, colored by the Mayer column, to
# highlight points of interest.
p +
  geom_path(color = "gray80") +
  geom_text(
    mapping = aes(color = Mayer, label = Year),
    size = 3,
    fontface = "bold"
  ) +
  theme(legend.position = "bottom") +
  labs(
    color = "Mayer is CEO",
    x = "Employees",
    y = "Revenue (Millions)",
    title = "Yahoo Employees vs Revenues, 2004-2014"
  ) +
  scale_y_continuous(labels = scales::dollar) +
  scale_x_continuous(labels = scales::comma)

# Alternative version
# Time on the x-axis and the revenue-to-employee ratio on the y-axis, with a
# vertical line and a text note at 2012.
p <- ggplot(
  data = yahoo,
  mapping = aes(x = Year, y = Revenue/Employees)
)
p +
  geom_vline(xintercept = 2012) +
  geom_line(color = "gray60", size = 2) +
  annotate(
    "text",
    x = 2013,
    y = 0.44,
    label = " Mayer becomes CEO",
    size = 2.5
  ) +
  labs(
    x = "Year\n",
    y = "Revenue/Employees",
    title = "Yahoo Revenue to Employee Ratio, 2004-2014"
  )
#### Saying no to pie
There is a reasonable amount of customization in this graph. First, the text of the facets is made bold in the theme() call. The graphical element is first named (strip.text.x) and then modified using the element_text() function. We also use a custom palette for the fill mapping, via scale_fill_brewer(). And finally we relabel the facets to something more informative than their bare variable names. This is done using the labeller argument and the as_labeller() function inside the facet_grid() call. At the beginning of the plotting code, we set up an object called f_labs, which is in effect a tiny data frame that associates new labels with the values of the type variable in studebt. We use backticks (the angled quote character located next to the ‘1’ key on US keyboards) to pick out the values we want to relabel. The as_labeller() function takes this object and uses it to create new text for the labels when facet_grid() is called.
When the categorical axis labels are long, though, I generally find it’s easier to read them on the y-axis. The colors on the graph are not encoding or mapping any information in the data that is not already taken care of by the faceting. The fill mapping is useful, but also redundant. This graph could easily be in black and white, and would be just as informative if it were.
One thing that is not emphasized in a faceted chart like this is the idea that each of the debt categories is a share or percentage of a total amount.
Instead of having separate bars distinguished by heights, we can array the percentages for each distribution proportionally within a single bar. We will make a stacked bar chart. We are careful to map the income categories in an ascending sequence of colors, and to adjust the key so that the values run from low to high, from left to right, and from yellow to purple. This is done partly by switching the fill mapping from Debt to Debtrc. The categories of the latter are the same as the former, but the sequence of income levels is coded in the order we want.
The rest of the work is done in the guides() call. We give guides() a series of instructions about the fill mapping: reverse the direction of the color coding; put the legend title above the key; put the labels for the colors below the key; widen the width of the color boxes a little, and place the whole key on a single row.
head(studebt)| Debt | type | pct | Debtrc |
|---|---|---|---|
| Under $5 | Borrowers | 20 | Under $5 |
| $5-$10 | Borrowers | 17 | $5-$10 |
| $10-$25 | Borrowers | 28 | $10-$25 |
| $25-$50 | Borrowers | 19 | $25-$50 |
| $50-$75 | Borrowers | 8 | $50-$75 |
| $75-$100 | Borrowers | 3 | $75-$100 |
# setting up some labels in advance, as we will reuse them
p_xlab <- "Amount Owed, in thousands of Dollars"
p_title <- "Outstanding Student Loans"
p_subtitle <- "44 million borrowers owe a total of $1.3 trillion"
p_caption <- "Source: FRB NY"

# a special label for the facets: maps each value of `type` to its strip text
f_labs <- c(`Borrowers` = "Percent of\nall Borrowers",
            `Balances` = "Percent of\nall Balances")

# Faceted bars: one panel per `type`, flipped so the Debt categories read
# along the y-axis.
p <- ggplot(data = studebt,
            mapping = aes(x = Debt, y = pct/100, fill = type))
p + geom_bar(stat = "identity") +
  scale_fill_brewer(type = "qual", palette = "Dark2") +
  scale_y_continuous(labels = scales::percent) +
  # The fill duplicates the faceting, so its legend is dropped. "none" is used
  # instead of FALSE because ggplot2 deprecated FALSE as a way of switching a
  # guide off (from version 3.3.4).
  guides(fill = "none") +
  theme(strip.text.x = element_text(face = "bold")) +
  labs(y = NULL, x = p_xlab,
       caption = p_caption,
       title = p_title,
       subtitle = p_subtitle) +
  facet_grid(~ type, labeller = as_labeller(f_labs)) +
  coord_flip()
# stacked bar chart
# One bar per `type`, split into Debtrc segments; pct/100 so the axis can be
# formatted as a percentage.
p <- ggplot(
  data = studebt,
  mapping = aes(y = pct/100, x = type, fill = Debtrc)
)

p +
  # a light gray border makes the bar segments easier to distinguish
  geom_bar(stat = "identity", color = "gray80") +
  scale_x_discrete(labels = as_labeller(f_labs)) +
  scale_y_continuous(labels = scales::percent) +
  # using scale_fill_viridis() for the color palette
  scale_fill_viridis(discrete = TRUE) +
  # legend: reversed order, title above the key, labels below it, wider
  # color boxes, everything on a single row
  guides(
    fill = guide_legend(
      reverse = TRUE,
      title.position = "top",
      label.position = "bottom",
      keywidth = 3,
      nrow = 1
    )
  ) +
  labs(
    x = NULL,
    y = NULL,
    fill = "Amount Owed, in thousands of dollars",
    caption = p_caption,
    title = p_title,
    subtitle = p_subtitle
  ) +
  theme(
    legend.position = "top",
    axis.text.y = element_text(face = "bold", hjust = 1, size = 12),
    axis.ticks.length = unit(0, "cm"),
    panel.grid.major.y = element_blank()
  ) +
  coord_flip()
The code in this RMarkdown is linted with the lintr package, which is based on the tidyverse style guide.
# lintr::lint("main.Rmd", linters =
# lintr::with_defaults(
# commented_code_linter = NULL,
# trailing_whitespace_linter = NULL
# )
# )
# # if you have additional scripts and want them to be linted too, add them here
# lintr::lint("scripts/my_script.R")